001    /*
002     * NeedlemanWunsch.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.alignment;
033    
034    import java.io.Reader;
035    import java.io.IOException;
036    
037    /**
038     * This class implements the classic global alignment algorithm (with linear gap penalty
039     * function) due to S.B.Needleman and C.D.Wunsch (1970).
040     *
041     * <P>It is based on a dynamic programming approach. The idea consists of, given two
042     * sequences A and B of sizes n and m, respectively, building an (n+1 x m+1) matrix M that
043     * contains the similarity of prefixes of A and B. Every position M[i,j] in the matrix
044     * holds the score between the subsequences A[1..i] and B[1..j]. The first row and column
045     * represent alignments with spaces.</P>
046     *
047     * <P>Starting from row 0, column 0, the algorithm computes each position M[i,j] with the
048     * following recurrence:</P>
049     *
050     * <CODE><BLOCKQUOTE><PRE>
051     * M[0,0] = 0
052     * M[i,j] = max { M[i,j-1]   + scoreInsertion (B[j]),
053     *                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
054     *                M[i-1,j]   + scoreDeletion(A[i])             }
055     * </PRE></BLOCKQUOTE></CODE>
056     *
057     * <P>In the end, the value at the last position (last row, last column) will contain
058     * the similarity between the two sequences. This part of the algorithm is accomplished
059     * by the {@link #computeMatrix computeMatrix} method. It has quadratic space complexity
060     * since it needs to keep an (n+1 x m+1) matrix in memory. And since the work of computing
061     * each cell is constant, it also has quadratic time complexity.</P>
062     *
063     * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path
064     * back in the matrix from the last position to the first. This step is performed by
065     * the {@link #buildOptimalAlignment buildOptimalAlignment} method, and since the path can
066     * be roughly as long as (m + n), this method has O(n) time complexity.</P>
067     *
068     * <P>If the similarity value only is needed (and not the alignment itself), it is easy to
069     * reduce the space requirement to O(n) by keeping just the last row or column in memory.
070     * This is precisely what is done by the {@link #computeScore computeScore} method. Note
071     * that it still requires O(n<SUP>2</SUP>) time.</P>
072     *
073     * <P>For a more efficient approach to the global alignment problem, see the
074     * {@linkplain CrochemoreLandauZivUkelson} algorithm. For local alignment, see the
075     * {@linkplain SmithWaterman} algorithm.</P>
076     *
077     * @author Sergio A. de Carvalho Jr.
078     * @see SmithWaterman
079     * @see CrochemoreLandauZivUkelson
080     * @see CrochemoreLandauZivUkelsonLocalAlignment
081     * @see CrochemoreLandauZivUkelsonGlobalAlignment
082     */
083    public class NeedlemanWunsch extends PairwiseAlignmentAlgorithm
084    {
085            /**
086             * The first sequence of an alignment.
087             */
088            protected CharSequence seq1;
089    
090            /**
091             * The second sequence of an alignment.
092             */
093            protected CharSequence seq2;
094    
095            /**
096             * The dynamic programming matrix. Each position (i, j) represents the best score
097             * between the firsts i characters of <CODE>seq1</CODE> and j characters of
098             * <CODE>seq2</CODE>.
099             */
100            protected int[][] matrix;
101    
102            /**
103             * Loads sequences into {@linkplain CharSequence} instances. In case of any error,
104             * an exception is raised by the constructor of <CODE>CharSequence</CODE> (please
105             * check the specification of that class for specific requirements).
106             *
107             * @param input1 Input for first sequence
108             * @param input2 Input for second sequence
109             * @throws IOException If an I/O error occurs when reading the sequences
110             * @throws InvalidSequenceException If the sequences are not valid
111             * @see CharSequence
112             */
113            protected void loadSequencesInternal (Reader input1, Reader input2)
114                    throws IOException, InvalidSequenceException
115            {
116                    // load sequences into instances of CharSequence
117                    this.seq1 = new CharSequence(input1);
118                    this.seq2 = new CharSequence(input2);
119            }
120    
121            /**
122             * Frees pointers to loaded sequences and the dynamic programming matrix so that their
123             * data can be garbage collected.
124             */
125            protected void unloadSequencesInternal ()
126            {
127                    this.seq1 = null;
128                    this.seq2 = null;
129                    this.matrix = null;
130            }
131    
132            /**
133             * Builds an optimal global alignment between the loaded sequences after computing the
134             * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method
135             * after the <CODE>computeMatrix</CODE> method computes the dynamic programming
136             * matrix.
137             *
138             * @return an optimal global alignment between the loaded sequences
139             * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
140             * with the loaded sequences.
141             * @see #computeMatrix
142             * @see #buildOptimalAlignment
143             */
144            protected PairwiseAlignment computePairwiseAlignment ()
145                    throws IncompatibleScoringSchemeException
146            {
147                    // compute the matrix
148                    computeMatrix ();
149    
150                    // build and return an optimal global alignment
151                    PairwiseAlignment alignment = buildOptimalAlignment ();
152    
153                    // allow the matrix to be garbage collected
154                    matrix = null;
155    
156                    return alignment;
157            }
158    
159            /**
160             * Computes the dynamic programming matrix.
161             *
162             * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
163             * with the loaded sequences.
164             */
165            protected void computeMatrix () throws IncompatibleScoringSchemeException
166            {
167                    int r, c, rows, cols, ins, del, sub;
168    
169                    rows = seq1.length()+1;
170                    cols = seq2.length()+1;
171    
172                    matrix = new int [rows][cols];
173    
174                    // initiate first row
175                    matrix[0][0] = 0;
176                    for (c = 1; c < cols; c++)
177                            matrix[0][c] = matrix[0][c-1] + scoreInsertion(seq2.charAt(c));
178    
179                    // calculates the similarity matrix (row-wise)
180                    for (r = 1; r < rows; r++)
181                    {
182                            // initiate first column
183                            matrix[r][0] = matrix[r-1][0] + scoreDeletion(seq1.charAt(r));
184    
185                            for (c = 1; c < cols; c++)
186                            {
187                                    ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c));
188                                    sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c));
189                                    del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r));
190    
191                                    // choose the greatest
192                                    matrix[r][c] = max (ins, sub, del);
193                            }
194                    }
195            }
196    
197            /**
198             * Builds an optimal global alignment between the loaded sequences. Before it is
199             * executed, the dynamic programming matrix must already have been computed by
200             * the <CODE>computeMatrix</CODE> method.
201             *
202             * @return an optimal global alignment between the loaded sequences
203             * @throws IncompatibleScoringSchemeException If the scoring scheme
204             * is not compatible with the loaded sequences.
205             * @see #computeMatrix
206             */
207            protected PairwiseAlignment buildOptimalAlignment ()
208                    throws IncompatibleScoringSchemeException
209            {
210                    StringBuffer    gapped_seq1, score_tag_line, gapped_seq2;
211                    int                             r, c, sub, max_score;
212    
213                    gapped_seq1     = new StringBuffer();
214                    score_tag_line  = new StringBuffer();
215                    gapped_seq2     = new StringBuffer();
216    
217                    // start at the last row, last column
218                    r = matrix.length - 1;
219                    c = matrix[r].length - 1;
220                    max_score = matrix[r][c];
221    
222                    while ((r > 0) || (c > 0))
223                    {
224                            if (c > 0)
225                                    if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c)))
226                                    {
227                                            // insertion was used
228                                            gapped_seq1.insert (0, GAP_CHARACTER);
229                                            score_tag_line.insert (0, GAP_TAG);
230                                            gapped_seq2.insert (0, seq2.charAt(c));
231                                            c = c - 1;
232    
233                                            // skip to the next iteration
234                                            continue;
235                                    }
236    
237                            if ((r > 0) && (c > 0))
238                            {
239                                    sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
240    
241                                    if (matrix[r][c] == matrix[r-1][c-1] + sub)
242                                    {
243                                            // substitution was used
244                                            gapped_seq1.insert (0, seq1.charAt(r));
245                                            if (seq1.charAt(r) == seq2.charAt(c))
246                                                    if (useMatchTag())
247                                                            score_tag_line.insert (0, MATCH_TAG);
248                                                    else
249                                                            score_tag_line.insert (0, seq1.charAt(r));
250                                            else if (sub > 0)
251                                                    score_tag_line.insert (0, APPROXIMATE_MATCH_TAG);
252                                            else
253                                                    score_tag_line.insert (0, MISMATCH_TAG);
254                                            gapped_seq2.insert (0, seq2.charAt(c));
255                                            r = r - 1; c = c - 1;
256    
257                                            // skip to the next iteration
258                                            continue;
259                                    }
260                            }
261    
262                            // must be a deletion
263                            gapped_seq1.insert (0, seq1.charAt(r));
264                            score_tag_line.insert (0, GAP_TAG);
265                            gapped_seq2.insert (0, GAP_CHARACTER);
266                            r = r - 1;
267                    }
268    
269                    return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(),
270                                                                                    gapped_seq2.toString(), max_score);
271            }
272    
273            /**
274             * Computes the score of the best global alignment between the two sequences using the
275             * scoring scheme previously set. This method calculates the similarity value only
276             * (doesn't build the whole matrix so the alignment cannot be recovered, however it
277             * has the advantage of requiring O(n) space only).
278             *
279             * @return score of the best global alignment between the loaded sequences
280             * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
281             * with the loaded sequences.
282             */
283            protected int computeScore () throws IncompatibleScoringSchemeException
284            {
285                    int[]   array;
286                    int             r, c, rows, cols, tmp, ins, del, sub;
287    
288                    rows = seq1.length()+1;
289                    cols = seq2.length()+1;
290    
291                    if (rows <= cols)
292                    {
293                            // goes columnwise
294                            array = new int [rows];
295    
296                            // initiate first column
297                            array[0] = 0;
298                            for (r = 1; r < rows; r++)
299                                    array[r] = array[r-1] + scoreDeletion(seq1.charAt(r));
300    
301                            // calculate the similarity matrix (keep current column only)
302                            for (c = 1; c < cols; c++)
303                            {
304                                    // initiate first row (tmp hold values
305                                    // that will be later moved to the array)
306                                    tmp = array[0] + scoreInsertion(seq2.charAt(c));
307    
308                                    for (r = 1; r < rows; r++)
309                                    {
310                                            ins = array[r] + scoreInsertion(seq2.charAt(c));
311                                            sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
312                                            del = tmp + scoreDeletion(seq1.charAt(r));
313    
314                                            // move the temp value to the array
315                                            array[r-1] = tmp;
316    
317                                            // choose the greatest
318                                            tmp = max (ins, sub, del);
319                                    }
320    
321                                    // move the temp value to the array
322                                    array[rows - 1] = tmp;
323                            }
324    
325                            return array[rows - 1];
326                    }
327                    else
328                    {
329                            // goes rowwise
330                            array = new int [cols];
331    
332                            // initiate first row
333                            array[0] = 0;
334                            for (c = 1; c < cols; c++)
335                                    array[c] = array[c-1] + scoreInsertion(seq2.charAt(c));
336    
337                            // calculate the similarity matrix (keep current row only)
338                            for (r = 1; r < rows; r++)
339                            {
340                                    // initiate first column (tmp hold values
341                                    // that will be later moved to the array)
342                                    tmp = array[0] + scoreDeletion(seq1.charAt(r));
343    
344                                    for (c = 1; c < cols; c++)
345                                    {
346                                            ins = tmp + scoreInsertion(seq2.charAt(c));
347                                            sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c));
348                                            del = array[c] + scoreDeletion(seq1.charAt(r));
349    
350                                            // move the temp value to the array
351                                            array[c-1] = tmp;
352    
353                                            // choose the greatest
354                                            tmp = max (ins, sub, del);
355                                    }
356    
357                                    // move the temp value to the array
358                                    array[cols - 1] = tmp;
359                            }
360    
361                            return array[cols - 1];
362                    }
363            }
364    }